load("/workspace/Stanford_EndOfLife_Expenses/EOL_workspace_before_exhibits.RData")


library(tidyverse)
library(tidymodels)
library(ggplot2)
library(workflows)
library(magrittr)
library(knitr)
library(kableExtra)
library(data.table)
library(plotROC)

revision_path = 'path'
source(paste0(revision_path,'data_retrieval.R'))
source(paste0(revision_path,'preprocess.R'))
source(paste0(revision_path,'analysis.R')) # get rds preds - calibration plot
source(paste0(revision_path,'modeling.R'))


parent_directory <- path
source(paste0(parent_directory, "set_themes.R"))
set_themes(base_size = 20)

setwd(path)

days_int <- 3         # determine last adm as such if ended no more than days_int before death
min_obs_num <- 10     # if entry contains less than 10 obs, drop it 
round_par <- 0.05     # determine the phat bins to be weighted on
prof_multi <- 4       # When weighting by wards (profession), multiply round_par in prof_multi factor
inplot_text_size <- 4
base_size  = 20 

# define intensity of hospitalization by ward (based on results of procedures and avg cost per ward)
intensity_wards <- c("oncology","internal_medicine",
                     "geriatry","rehabilitation")

high_intensity_wards <-c( "gastroenterology","neurology"  ,"orthopedic_surgery",
                          "general_surgery" , "ICU" ,  "urology")  

top_All_prof_10 <- c(high_intensity_wards,intensity_wards)

# in this file we saved the original predictions and feature tables: 

# source all functions of analysis : 
source_analysis <- function(func) {source(paste0(parent_directory,
                                                 func,".R"))}

list_of_func<- c("fig_concentration", 
                 "fig_Spending_byPredictedMortality",
                 "fig_MonthlySpending_byPhat_andDeath",
                 "fig_cancer_Decedent_Spending_By_Time",
                 "fig_Average_Monthly_Spending_by_Type",
                 "fig_60_days_after_admission",
                 "fig_60_days_after_admission_events",
                 "fig_60_days_after_admission_revision",
                 "fig_MonthlySpending_byPhat_andDeath_ageQ",
                 "table_1",
                 "table_2",
                 "table_2_matching",
                 "table_2_procedure",
                 "table_3",
                 "tab_Ave_monthly_spending_cancer", #2 & alternatives
                 "table_3_month",
                 "table_4",
                 "table_5_month",
                 "fig_concetration_100_bins", # a1
                 "fig_MonthlySpending_byPhat_andDeath_ageQ_CancerType", # a2,a3,a4
                 "fig_Predictive_Model_Fit",#a5
                 "fig_dyn_boxplot_by_month", #a6
                 "tab_Adm_Char_by_Hospital_Ownership", #a1
                 "tab_top_10_wards_intensity_desc" , #a2
                 "tab_mainTopo_descStats",  #a3
                 "tab_Ave_Monthly_Spending_All", #a4 #a5
                 "tab_Procedures_planned_intensity",# a8
                 "tab_Select_Predictors", #a11 
                 "tab_N_by_sample",  
                 "tab_Alternative_Adm_Grouping", #a14 
                 "build_dt",
                 "build_dyn_new",
                 "get_monthly_cost",
                 "get_adm_data_cancer",
                 "events_tree",
                 "events_describe")
lapply(list_of_func, source_analysis)

# function to load initial prediction of cancer patients 
get_revision_cancer_monthly_data_month <- function(con,month = 'cnr_data') {
  # returns query for monthly subset of revision cancer table
  (get_revision_cancer_data(con)
   %>% filter(S_sample_source_XX %in% c(month))
  )
}

# save model descriptions -----------------------

# collect all AUCs of models to one table 
library(readr)
  read_csv("canc_revision_events_AUC.csv") %>% 
    bind_rows(read_csv("monthly_canc_AUC.csv")) %>% 
    bind_rows(read_csv("canc_revision_all_model_AUC.csv")) %>% 
    bind_rows(read_csv("canc_revision_noEMR_model_AUC.csv")) %>% 
    bind_rows(read_csv("results/p0_canc_AUC.csv"))  %>% 
    write_csv( .,"AUC_all_mods.csv")
  

# save cancer initial prediction feature importance 
initial_predict <- get_load_new(file_name = "revision_p0.RData")

initial_predict$feature_importance[1:50] %>% 
  write.csv("init_feature_importance.csv")

# 
revision_cancer_month1 <- get_revision_cancer_monthly_data_month(con)


pdf( "calib_new_data_new_preds.pdf")
calibration_plot(get_load_new(file_name = "revision_p0.RData")$preds_test,'preds_after_bayes','true_value',20)
dev.off()

# build main tables for analyis - cancer sample: 
new_cnr <- revision_cancer_month1 %>%
  filter(S_index_date_XX < DMG_date_of_death_XX | is.na(DMG_date_of_death_XX)) %>%
  collect()

new_test_cnr <- new_cnr %>% 
  right_join(get_load_new(file_name = "revision_p0.RData")$preds_test ,
            by ='id_var') %>% 
  mutate(prob_for_report = preds_after_bayes,
         UTL_f365d_hospUnplanned_admDay = UTL_f365d_hospUnplanned_admDays) %>%
  add_grouped_cancer_type() %>% 
  add_age_quantiles() %>% 
  mutate(DMG_died_within_365d = factor(DMG_died_within_365d),
         DMG_age = as.numeric(DMG_age)) %>% 
  data.table() 

new_test_cnr <- get_test_data(new_test_cnr)
new_test_cnr[,S_index_date_XX:=as.Date(S_index_date_XX)]

# build data of hospitalizations : 
dt_adm<-get_dt_hosp(top_All_prof_10 = top_All_prof_10)

# build tables for analysis - population and cost - cancer sample 
library(zeallot)
c(dt_for_exhibits_cancer,
  dt_cost_for_exhibits_cancer,
  dt_cost_for_exhibit_byMainCat_cancer ) %<-%  Build_relevant_dt("cancer")

# build tables for analysis - population only - all sample 
dt_for_exhibits_all <-  Build_relevant_dt("all", cost = F)

# build monthly prediction table (person-month)
dt_dynamic <- build_dyn_new(con = con)

# build table with cost and monthly prediction 
dt_cost_for_exhibits_cancer_month <- merge(x = dt_cost_for_exhibits_cancer[, pos := .I],
                                           y = dt_dynamic[,
                                                          .(id_var ,
                                                            dyn_index_date_XX = S_index_date_XX,
                                                            month_from_index,
                                                            dyn_prob = prob_for_report)],
                                           by = "id_var",
                                           all.data=TRUE,
                                           allow.cartesian=TRUE)[,
                                     diff_index :=   as.numeric(difftime(
                                         dyn_index_date_XX ,
                                         cost_date,
                                         units = "days"))  ][diff_index<=0,][
                                           ,min_diff := max(diff_index), by=pos][
                                             min_diff==diff_index][
                                               ,c("min_diff","pos"):=NULL]

library(zeallot)
c(dt_mc,
  dt_mc_mainCat ,
  dt_mc_adm_prof_all ,
  dt_mc_adm_prof ) %<-%  get_monthly_cost(con)


## Analysis --------------------------------------------------------------------
## OO FIGURES OO-------------------------------------------------------------------------------------------

# Spending  Concentration, Different Subpopulations  -----------------------------------------------------
fig_concentration(data_cnr = new_cnr %>% data.table())


# Spending by Predicted Mortality  ------------------------------------------------------------------------
# Cancer sample 
fig_Spending_byPredictedMortality(data_all  = dt_for_exhibits_all,
           data_cnr = new_test_cnr)

#General population sample  (appendix)
fig_Spending_byPredictedMortality(data_all  = dt_for_exhibits_all,
           data_cnr = new_test_cnr ,
           sample_n = "all")

# Spending by Predicted Mortality - Grouped by Mortality Outcome ----------------------------------------
# Cancer sample
fig_MonthlySpending_byPhat_andDeath(data_all =  dt_for_exhibits_all ,
           data_cnr =new_test_cnr,
           sample_n = "cancer"  )

# GP sample (appendix)
fig_MonthlySpending_byPhat_andDeath(data_all =  dt_for_exhibits_all ,
           data_cnr =new_test_cnr,
           sample_n = "all"  )

# cancer Decedent Spending By Time Before Death and After Diagnosis ---------------------------------------
# 1) low/ high Intensity 
fig_cancer_Decedent_Spending_By_Time_a( data = new_test_cnr,
             data_cost = dt_cost_for_exhibits_cancer %>% 
                      mutate(new_cat = factor(case_when(main_cat%in%c("Inpatient_Planned" ,"Inpatient_Unplanned") &
                                                  profession%in%intensity_wards ~ "Low", 
                                              main_cat%in%c("Inpatient_Planned" ,"Inpatient_Unplanned") &
                                                  !profession%in%intensity_wards ~ "High",
                                              TRUE ~ as.character(main_cat)))) %>% data.table(),
              intensity_wards =intensity_wards )

fig_cancer_Decedent_Spending_By_Time_b( data = new_test_cnr,
             data_cost = dt_cost_for_exhibits_cancer %>% 
               mutate(new_cat = factor(case_when(main_cat%in%c("Inpatient_Planned" ,"Inpatient_Unplanned") &
                                                   profession%in%intensity_wards ~ "Low", 
                                                 main_cat%in%c("Inpatient_Planned" ,"Inpatient_Unplanned") &
                                                   !profession%in%intensity_wards ~ "High",
                                                 TRUE ~ as.character(main_cat))))  %>% data.table(),
             intensity_wards =intensity_wards )


# 2) planne/ unplanned 
fig_cancer_Decedent_Spending_By_Time_a( data = new_test_cnr,
             data_cost = dt_cost_for_exhibits_cancer %>% 
               mutate(new_cat = main_cat) %>% data.table(),
             intensity_wards =intensity_wards,
            hosp_a = "Inpatient_Unplanned",
            hosp_b = "Inpatient_Planned",
            hosp_a_name = "Inpatient -\nUnplanned", 
            hosp_b_name = "Inpatient -\nPlanned")

fig_cancer_Decedent_Spending_By_Time_b( data = new_test_cnr,
             data_cost = dt_cost_for_exhibits_cancer %>% 
               mutate(new_cat = main_cat) %>% data.table(),
             intensity_wards =intensity_wards,
             hosp_a = "Inpatient_Unplanned",
             hosp_b = "Inpatient_Planned",
             hosp_a_name = "Inpatient -\nUnplanned", 
             hosp_b_name = "Inpatient -\nPlanned")


# 3) procedure based / per-diem 
fig_cancer_Decedent_Spending_By_Time_a( data = new_test_cnr,
             data_cost = (dt_cost_for_exhibits_cancer %>% mutate(new_cat = as.character(main_cat)) %>% 
                            mutate(new_cat =  factor(case_when(category=="differential" ~ "Inpatient_DRG",
                                                               category%in%c("hospitalization_urgent",
                                                                             "hospitalization_elective") ~ "Inpatient_PerDiem",
                                                               TRUE ~ as.character(main_cat )))) %>%  data.table()),
             intensity_wards =intensity_wards,
             hosp_a = "Inpatient_PerDiem",
             hosp_b = "Inpatient_DRG",
             hosp_a_name = "Inpatient -\nPer Diem", 
             hosp_b_name = "Inpatient -\nDRG")

fig_cancer_Decedent_Spending_By_Time_b( data = new_test_cnr,
             data_cost = (dt_cost_for_exhibits_cancer %>% mutate(new_cat = as.character(main_cat)) %>% 
                            mutate(new_cat =  factor(case_when(category=="differential" ~ "Inpatient_DRG",
                                                               category%in%c("hospitalization_urgent",
                                                                             "hospitalization_elective") ~ "Inpatient_PerDiem",
                                                               TRUE ~ as.character(main_cat )))) %>%  data.table()),
             intensity_wards =intensity_wards,
             hosp_a = "Inpatient_PerDiem",
             hosp_b = "Inpatient_DRG",
             hosp_a_name = "Inpatient -\nPer Diem", 
             hosp_b_name = "Inpatient -\nDRG")



# Average Monthly Spending on Cancer Patients, by Type of Service And Intensity ----------------------------
fig_Average_Monthly_Spending_by_Type(dt_exh = new_test_cnr, 
           dt_cst = dt_cost_for_exhibits_cancer %>% 
             mutate(top_main_cat = factor(case_when(main_cat%in%c("Inpatient_Planned" ,"Inpatient_Unplanned") &
                                                 profession%in%intensity_wards ~ "Low", 
                                               main_cat%in%c("Inpatient_Planned" ,"Inpatient_Unplanned") &
                                                 !profession%in%intensity_wards ~ "High",
                                               TRUE ~ "Other"))) %>% data.table(),
           hosp_a = "Low",   #dotted
           hosp_b = "High",  # dashed
           hosp_a_name = "Inpatient :\nLow Intensity", 
           hosp_b_name = "Inpatient :\nHigh Intensity",
           loc = c(2900, 1150, 4200,
                   2300, 6600, 4400,
                   2500, 900, 4200)

           )

# planned
fig_Average_Monthly_Spending_by_Type(dt_exh = new_test_cnr, 
           dt_cst = dt_cost_for_exhibits_cancer %>% 
             mutate(top_main_cat = factor(case_when(main_cat%in%c("Inpatient_Planned" ,"Inpatient_Unplanned") ~ as.character(main_cat), 
                                                    TRUE ~ "Other"))) %>% data.table(),
           hosp_a = "Inpatient_Unplanned", #dotted
           hosp_b = "Inpatient_Planned",   # dashed
           hosp_a_name = "Inpatient \nUnplanned", 
           hosp_b_name = "Inpatient \nPlanned",
           loc = c(2400, 900, 4300,  # b,a,other
                   6000, 4400, 2300,
                   2800, 700, 4400)
  
)

# DRG
fig_Average_Monthly_Spending_by_Type(dt_exh = new_test_cnr, 
           dt_cst = (dt_cost_for_exhibits_cancer %>% 
                       mutate(top_main_cat =  factor(case_when(category=="differential" ~ "Inpatient_DRG",
                                                          category%in%c("hospitalization_urgent",
                                                                        "hospitalization_elective") ~ "Inpatient_PerDiem",
                                                          TRUE ~ "Other"))) %>%  data.table()),
           hosp_a = "Inpatient_PerDiem", #dotted
           hosp_b = "Inpatient_DRG",   # dashed
           hosp_a_name = "Inpatient \nPer Diem", 
           hosp_b_name = "Inpatient \nDRG",
           loc = c(900, 2200, 4300,  # dash,dotthed,other
                   2500, 6000, 4400,
                   900, 2400, 4400))


# Fraction of Admissions Ending in Death Within 60 Days, By Current Predicted Mortality  -------------

# based on monthly prediction (nearest pred. to admission)
dt_for_fig_60_days <- build_adm_month(dt_hosp_raw_1_row = dt_adm[max_cost_row==1],
                                dt_dyn = dt_dynamic)

do_figure60days(dt_for_fig_60_days )

# Alternative Figure 6 with and without EMR variablles, for admissions in Clalit-owned hospitals -  alternative with EMR
all1year<-do_figure60_days_revision(con = getMechkarConnection(),
                                path ="",
                                file_name  = "revision_clalit_all_model" ,
                                file_name_no_adm  = "revision_clalit_all_model_noadm" ,
                                intensity_wards = c("oncology","internal_medicine","geriatry","rehabilitation"),
                                period = "1 Year") 


clalit_adm_all_pred <-  get_load_new(file_name = "revision_clalit_all_model.RData")

calibration_plot(clalit_adm_all_pred$preds_test,'preds_after_bayes','true_value',20) +
  ggsave(filename = "calib_clalit_all_features.pdf", w = 10, h=7)

clalit_adm_all_pred$feature_importance %>% slice_head(n=50)  %>%
  write.csv("feature_importance_all_features.csv")

# Alternative Figure 6 with and without EMR variablles, for admissions in Clalit-owned hospitals  - alternative without EMR

do_figure60_days_revision(con = getMechkarConnection(),
                    path ="",
                    file_name  = "revision_all_model_noadm" ,
                    intensity_wards = c("oncology","internal_medicine","geriatry","rehabilitation"),
                    compare = F,
                    period = "1 Year (Without EMR Data)") 

clalit_adm_noEMR_pred <-  get_load_new(file_name = "revision_all_model_noadm.RData")

calibration_plot(clalit_adm_noEMR_pred$preds_test,'preds_after_bayes','true_value',20) +
  ggsave(filename = "calib_noEMR_features.pdf", w = 10, h=7)

clalit_adm_noEMR_pred$feature_importance %>% slice_head(n=50)  %>%
  write.csv("feat_importance_noEMR_features.csv")

# based on events prediction :
do_figure60_days_events()

# Spending and Mortality of Decedendts and Survivors, by Age Quintiles --------------------------------
MonthlySpending_byPhat_andDeath_ageQ(data = new_test_cnr)

# EVENTS : -------------------------------------------------------------------------------------
event_desc<- do_event_describe()
do_events_tree(con = con )
# outputs:
# events_tree.csv , events_tree_median_age.csv,
# events_AUC.csv  , events_feature_importance.csv

# OO TABLES OO-----------------------------------------------------------------------------------------
# Demographics, Cost and Mortality -------------------------------------------------------------------
table_1<- do_table_1(data_cnr = data.table(new_cnr %>%
                                             mutate( DMG_clinic_ethnicity = fct_explicit_na(as.character(DMG_clinic_ethnicity),"Missing"),
                                                     ACG_RUB = fct_explicit_na(as.character(ACG_RUB),"Missing"),
                                                     
                                                     COV_smoking_code = fct_explicit_na(as.character(COV_smoking_code),"Missing"),
                                                     DMG_ses_by_clinic = fct_explicit_na(as.character(DMG_ses_by_clinic),"Missing")
                                             )
                                           )[,  obs_uniq_ident := .GRP,
                                                 by = .(id_var,
                                                        S_index_date_XX,
                                                        S_sample_source_XX)],
                     min_age = 25)
 

# Average Monthly Spending of Cancer Patients----------------------------------------------------------
original_table_2_month   <-full_table_2("initial_risk_bins_month",
                                               data_probs = dt_dynamic,
                                               type_phat_zero=T,
                                               type_reweight_oneway = F, 
                                               parm_bins = 0.1, num_of_bins = 10)

# Average Monthly Spending of Cancer Patients with Alternative Admission Grouping (table a14) ----------------------------
# function from figure 60_days script : 
dt_adm_month <- build_adm_month(dt_hosp_raw_1_row = dt_adm,
                                dt_dyn = dt_dynamic)

# function from #table_2_procedure
dt_adm_proc <- paste_adm_proc(con= con)

# Admission Intensity, by Main Theraeutic Procedure (a12 )----------------------------------------------
# Admission Intensity, by Type of Billing  (a13) ------------------------------------------------------
do_table_2_procedure(dt_adm_proc)

dt_adm_proc_month <- build_adm_month(dt_hosp_raw_1_row = data.table(paste_adm_proc(con= con , data_adm = dt_adm    )),
                                     dt_dyn = dt_dynamic)

table_A14    <-do_Alternative_Adm_Grouping("table_A14",
                                  data_probs = dt_dynamic,
                                  type_phat_zero=T,
                                  type_reweight_oneway = F, 
                                  parm_bins = 0.1, num_of_bins = 10)


# source(paste0(revision_path,'hosp_procedure_code.R'))
# output: hosp_procedure_ward,  hosp_procedure_peula


# Monthly Admission Statistics for Cancer Patients , Survivor Statistics Reweighted by Initial Prognoses (a17) --------------------------------------------------
#annual admissions statistics for cancer patients, classifying admissions based on ward intensity 

do_table_3(dt_exh = new_test_cnr)

# annual admissions statistics for cancer patients, classifying admissions based on biliing method

do_table_3(dt_exh = new_test_cnr ,
           dt_cost = (dt_cost_for_exhibits_cancer %>% mutate(main_cat = as.character(main_cat)) %>% 
                          mutate(profession =  factor(case_when(category=="differential" ~ "Inpatient_DRG",
                                                              category%in%c("hospitalization_urgent",
                                                                            "hospitalization_elective") ~ "Inpatient_PerDiem",
                                                              TRUE ~ main_cat))) %>% 
                          mutate(main_cat = factor(main_cat)) %>% data.table()),
           low_intensity =  c("Inpatient_PerDiem"),
           sample_n = "low_per_diem_high_DRG" )

# annual admissions statistics for cancer patients, classifying admissions to planned and unplanned

do_table_3(dt_exh = new_test_cnr ,
           dt_cost = (dt_cost_for_exhibits_cancer %>% mutate(profession =  main_cat)  %>% data.table()),
           low_intensity =  c("Inpatient_Unplanned"),
           sample_n = "low_Unplanned_high_planned" )

# Monthly Admission Statistics for Cancer Patients --------------------------------------------------
table_3_month<- do_table_3_month()

# Inpatient Procedures by Admission Time Before death ----------------------------------------------
table_4 <- do_table_4()

# Average Monthly Spending of Cancer Patients, by Age Quintile -------------------------------------
t5c<-do_table_5_month(data = dt_dynamic,
                 name = "by_month",
                 one_way = F )



### OO APPENDIX FIGURES OO ------------------------------------------------------------------------
# The Share of Total Adjusted Spending Accounted for by Individuals with Different Prognoses ------
do_concentration_100_bins(data = new_test_cnr)

# Spending and Mortality, Separately by Cancer Type and Age Quintile ------------------------------
do_MonthlySpending_byPhat_andDeath_ageQ_CancerType(data = new_test_cnr)

# Predictive Model Fit ---------------------------------------------------------------------------
do_Predictive_Model_Fit(data_cnr  = new_test_cnr)

# Model Calibration, by Cancer Sample Age Group -------------------------------------------------
new_test_cnr  %>%
  mutate(bins = gtools::quantcut(prob_for_report,
                                 seq(0,1,1/20),
                                 right = TRUE,
                                 labels = as.character(seq(0.01,1,1/20)))) %>% 
  group_by(bins,DMG_age_quintiles) %>% 
  summarise(actual_rate   = mean(DMG_died_within_365d == "1"),
            pred_mortality = mean(prob_for_report)) %>%
  bind_rows(new_test_cnr  %>%
              mutate(bins = gtools::quantcut(prob_for_report,
                                             seq(0,1,1/20),
                                             right = TRUE,
                                             labels = as.character(seq(0.01,1,1/20)))) %>% 
              group_by(bins) %>% 
              summarise(actual_rate   = mean(DMG_died_within_365d == "1"),
                        pred_mortality = mean(prob_for_report)) %>%
              mutate(DMG_age_quintiles = "All")) %>%
  ggplot(data = .) +
  geom_point(aes(x = pred_mortality, 
                 y = actual_rate),
             size = 3) +
  geom_abline(intercept = 0, slope = 1, linetype = 3) +
  scale_x_continuous(limits = c(0,1),breaks=seq(0, 1, 0.5)) +
  scale_y_continuous(limits = c(0,1),breaks=seq(0, 1, 0.2)) +
  labs(x = "Average Initial Prognosis (Predicted One-Year Mortality Risk)",
       y = "Actual One-Year Mortality Rate",
       caption = paste0(20," bins") )+ 
  facet_wrap(~DMG_age_quintiles) +
  theme(aspect.ratio = 1) + 
  ggsave("calib_age_group.pdf", w = 10, h=7)   

# One-Year Mortality Risk Distribution, Predicted Over Time ------------------------------------
plot_box_dynamic(data = dt_dynamic )


### OO APPENDIX TABLES OO ----------------------------------------------------------------------
# Admission Characteristics by Hospital Ownership ----------------------------------------------
table_A1 <- do_Adm_Char_by_Hospital_Ownership(dt_hosp = dt_adm[max_cost_row ==1 ],
            dt_hosp_lookup = get_clalit_lookup() ,
            data = new_test_cnr %>% 
              mutate(DMG_clinic_ethnicity = fct_explicit_na(as.character(DMG_clinic_ethnicity),"Missing"),
                     ACG_RUB = fct_explicit_na(as.character(ACG_RUB),"Missing")
              ) %>% data.table() 
            )


# Admission Intensity, by Ward (a2 ) ------------------------------------------------------------------
# Admission Intensity, by Ward and Mortality Status (a9) ----------------------------------------------

# this function return table for all sanple , and by Mortality Status 
table_A2<- do_top_10_wards_intensity_desc(con = con,
                       dt_hosp_raw = dt_adm  )

# Additional Descriptive Statistics (a3) --------------------------------------------------------------
# we use this vector when creating the events tree  analysis 
age_cancer_type <- do_table_mainTopo_descStats(data_cnr = new_test_cnr)[7:27, .(Sample,med_age)]


# 8 Versions of "table 2" : ----------------------------
# a) bins by level/distribution 
# b) rewweighting by risk/risk-month
# c) risk initial/current

# Average Monthly Spending of Cancer Patients, Survivor spending Reweighted by Initial Prognoses (a16) ---------------------------
original_table_2          <-full_table_2("initial_risk_bins",
                                         data_probs = dt_dynamic,
                                         type_phat_zero=T,
                                         parm_bins = 0.1, num_of_bins = 10)

original_table_2_month


original_table_A6  <-full_table_2("dynamic_risk_bins",
                                         data_probs = dt_dynamic,
                                         type_phat_zero=F,
                                  parm_bins = 0.1, num_of_bins = 10)

# Average Monthly Spending of Cancer Patients, Reweighted by Current Risk ------------------------------------
original_table_A6_month  <-full_table_2("dynamic_risk_bins_month",
                                  data_probs = dt_dynamic,
                                  type_phat_zero=F,
                                  type_reweight_oneway = F, 
                                  parm_bins = 0.1, num_of_bins = 10)

## alternative specifications for reweighting of survivor spending - bins by risk level (0,0,1,…0.9,1) -----------
res_level_binning <-rbind(original_table_2[[1]][c(1,2,5),],
                          original_table_2_month[[1]][c(1,2,5),],
                          original_table_A6[[1]][c(1,2,5),],
                          original_table_A6_month[[1]][c(1,2,5),])


res_level_binning[Category=="All Inpatient:", Category:="~~All Inpatient"]

invisible(Hmisc::latex(
  res_level_binning,
  file = paste0("ave_monthly_cost_risk_bins_variants_combined.tex"),
  center = 'centering',
  n.cgroup = c(1,2, 1 ,2 ),
  cgroup   = c("","Survivor", "Decedent", "Difference"),
  extracolheads = c("","(1)", "(2)", "(3)", "(4)","(5)"),
  n.rgroup = c(3, 3, 3 , 3),
  rgroup = c("A. Reweighting by initial risk",
             "B. Reweighting by initial risk and month",
             "C. Reweighting by current risk",
             "D. Reweighting by current risk and month"),
  colheads = c("\\thead{Category}",
               "\\thead{Unweighted}",
               "\\thead{Reweighted by\\\\Decedent Risk}", 
               "\\thead{Adjusted for \\\\ Survival\\\\Duration}",
               "\\thead{Decedent -\\\\ Survivor\\\\(Reweighted)}",
               "\\thead{Percent of \\\\ Total Difference}"),
  rowname =NULL,
  col.just = c("l", rep.int("r", 5)), 
  extracolsize = "normalsize"
))

# Average monthly spending - comparison of 4 table 2 variants compbind -----------------------

aaa<- full_table_2("initial_risk_distribution_bins",
                                         data_probs = dt_dynamic,
                                         type_phat_zero=T ,
                                         type_binning_level=F,
                                         type_reweight_oneway = T,
                   parm_bins = 0.1, num_of_bins = 10)

bbb<- full_table_2("initial_risk_distribution_bins_month",
                                  data_probs = dt_dynamic,
                                  type_phat_zero=T ,
                                  type_binning_level=F,
                                  type_reweight_oneway = F,
                   parm_bins = 0.1, num_of_bins = 10)
ccc<-full_table_2("dynamic_risk_distribution_bins",
                                  data_probs = dt_dynamic,
                                  type_phat_zero=F ,
                                  type_binning_level=F,
                                  type_reweight_oneway = T,
                                   parm_bins = 0.1, num_of_bins = 10)

ddd<-full_table_2("dynamic_risk_distribution_bins_month",
                                  data_probs = dt_dynamic,
                                  type_phat_zero=F ,
                                  type_binning_level=F,
                                  type_reweight_oneway = F,
                                  parm_bins = 0.1, num_of_bins = 10)

#alternative specifications for reweighting of survivor spending - bins by distribution of risk--------------

res<-rbind(aaa[[1]][c(1,2,5),],
           bbb[[1]][c(1,2,5),],
           ccc[[1]][c(1,2,5),],
           ddd[[1]][c(1,2,5),])


res[Category=="All Inpatient:", Category:="~~All Inpatient"]

invisible(Hmisc::latex(
  res,
    file = paste0("ave_monthly_cost_distribution_bins_variants_combined.tex"),
  center = 'centering',
  n.cgroup = c(1,2, 1 ,2 ),
  cgroup   = c("","Survivor", "Decedent", "Difference"),
  extracolheads = c("","(1)", "(2)", "(3)", "(4)","(5)"),
  n.rgroup = c(3, 3, 3 , 3),
  rgroup = c("A. Reweighting by initial risk",
             "B. Reweighting by initial risk and month",
             "C. Reweighting by current risk",
             "D. Reweighting by current risk and month"),
  colheads = c("\\thead{Category}",
               "\\thead{Unweighted}",
               "\\thead{Reweighted by\\\\Decedent Risk}", 
               "\\thead{Adjusted for \\\\ Survival\\\\Duration}",
               "\\thead{Decedent -\\\\ Survivor\\\\(Reweighted)}",
               "\\thead{Percent of \\\\ Total Difference}"),
  rowname =NULL,
  col.just = c("l", rep.int("r", 5)), 
  extracolsize = "normalsize"
))

# Number of Cases by Predicted Mortality Risk and Time Since Diagnosis --------------------
# Average Monthly Spending by Predicted Mortality Risk and Time Since Diagnosis -------------------------

fig3_phat0_3d         <-full_table_2("revision_table_2_phat",
                                         data_probs = dt_dynamic,
                                         type_phat_zero=T,
                                         type_binning_level=F,
                                         type_reweight_oneway = T,
                                         parm_bins = 0.1,
                                         num_of_bins = 10)
fig3_phat0_3d         <-full_table_2("revision_table_2_phat_month",
                                     data_probs = dt_dynamic,
                                     type_phat_zero=T,
                                     type_binning_level=F,
                                     type_reweight_oneway = F,
                                     parm_bins = 0.1,
                                     num_of_bins = 10)
fig3_phat_dynamic_3d         <-full_table_2("revision_table_2",
                                         data_probs = dt_dynamic,
                                         type_phat_zero=F,
                                         type_binning_level=T,
                                         type_reweight_oneway = F,
                                         parm_bins = 0.1,
                                         num_of_bins = 10)

add_all <- function(data ){
  rbind(data[,-c("weights", "weights_Decedent")],
        data[,.(sum_cost = sum(sum_cost),
                sum_days = sum(sum_days),
                N=sum(N)),
             by = .(bins,time)][,
                                `:=`(group="All", cost=(sum_cost/sum_days)*30)])[N>10] 
}

add_all(fig3_phat0_3d[[3]])  %>% write_csv("fig3_phat0_3d.csv")
add_all(fig3_phat_dynamic_3d[[3]])  %>% write_csv("fig3_phat_dynamic_3d.csv")

# Decedent and Matched Survivor Spending  (table 2 matching -----------------------------------------------------------------
match_res<- do_match( data_probs = dt_dynamic)
do_match_describe(match_res, data_probs =  dt_dynamic)

table_2_matched <- print_table_2_matched(pairs =match_res,
                      probs = dt_dynamic)


# Demographics, Cost and Mortality, 25+ Sample ----------------------------------------------------------
table_a7<- do_table_1(data_cnr = data.table(new_cnr %>%
                                             mutate( DMG_clinic_ethnicity = fct_explicit_na(as.character(DMG_clinic_ethnicity),"Missing"),
                                                     ACG_RUB = fct_explicit_na(as.character(ACG_RUB),"Missing"),
                                                     COV_smoking_code = fct_explicit_na(as.character(COV_smoking_code),"Missing"),
                                                     DMG_ses_by_clinic = fct_explicit_na(as.character(DMG_ses_by_clinic),"Missing")
                                             ) )[,
                                                 obs_uniq_ident := .GRP,
                                                 by = .(id_var,
                                                        S_index_date_XX,
                                                        S_sample_source_XX)],
                     min_age = 25)

invisible(Hmisc::latex(
  table_a7[,1:3],
  file = paste0("desc_stats_Age65p.tex"),
  center = 'centering',
  n.cgroup = c(3),
  cgroup   = c("General Population Sample"),
  colheads = c("All", "Decedent", "Survivor"),
  n.rgroup = c(4, 3, 2,3, 1),
  rgroup   = c("Characteristics",
               "Mortality Rate",
               "\\thead{Utilization\\\\12 Months Before Initial Prognosis}",
               "12 Months After Initial Prognosis",
               ""),
  rowname = c("Age (mean)", "Female (\\%)", "High Socioeconomic Status (\\%)", "Supplementary Insurance (\\%)"
              , "1 month (\\%)", "1 year (\\%)",  "3 years (\\%)",
              "Average Monthly Spending (NIS)", "Any admission (\\%)", 
              "Average Monthly Spending  (Unadjusted NIS)",
              "Average Monthly Spending  (Adjusted NIS)",
              "Any admission (\\%)",
              "Number of Beneficiaries"),
  rowlabel = "",  
  col.just = c(rep.int("r", 3)),
  extracolheads = c("(1)", "(2)", "(3)"),
  na.blank = TRUE, 
  extracolsize = "normalsize"
))

#  Procedures in Planned and Unplanned Inpatient, by Admission Time Before Death
tab_Procedures_planned_intensity()

# Select Predictors ( table A11):----------------------------------------------------------------------
table_a11<- do_Select_Predictors(data_cnr = data.table(new_cnr %>% 
                           mutate(DMG_clinic_ethnicity = fct_explicit_na(as.character(DMG_clinic_ethnicity),"Missing"),
                                  ACG_RUB = fct_explicit_na(as.character(ACG_RUB),"Missing")
                                  ) )[,
                                          obs_uniq_ident := .GRP,
                                          by = .(id_var,
                                                 S_index_date_XX,
                                                 S_sample_source_XX)])

#   N by group --------------------------------------------------------------------------------------------
do_N_by_sample(data_cnr = data.table(new_cnr %>%
                                       mutate(hashed_id_num = gsub("[^0-9]", "", openssl::md5(as.character(id_var)))
                                              %>% substr(1,4)
                                              %>% as.numeric(),
                                              DMG_died_within_365d = fct_explicit_na(as.character(DMG_died_within_365d),"Missing")
                                       ) 
)
)

# describe monthly resuls  ------------------------------------------------
monthly_result<- get(load('revision_monthly.RData'))

monthly_result %>%  map(~chuck(., 'feature_importance') %>% slice_head(n=50) ) %>%
  bind_rows(.,.id = "month") %>% 
  write.csv("feat_importance_monthly_prediction.csv")

monthly_result %>%  map(~chuck(., 'roc_auc') %>% select(.estimate) )  %>% 
  unlist(use.names = T)  %>% data.frame() %>% mutate(month = 1:12) %>% 
  rename("auc"=".") %>% 
  ggplot(.,aes(x=factor(month), y = auc))+
  geom_point()+
  geom_text(aes(label = round(auc,3)), size = 5, nudge_y = 0.01)+
  ylim(0.8,1)+
  labs(y = "AUC" , x= "Month") +
  ggsave("auc_monthly.pdf", w = 10, h=7)

#calibrate_monthly.pdf
monthly_result %>%  map(~chuck(., 'preds_test')  ) %>% 
  bind_rows(.,.id = "month") %>% 
  mutate(bins = gtools::quantcut(preds_after_bayes,
                                 seq(0,1,1/20),
                                 right = TRUE,
                                 labels = as.character(seq(0.01,1,1/20)))) %>% 
  group_by(bins,month) %>% 
  summarise(actual_rate   = mean(true_value == "1"),
            pred_mortality = mean(preds_after_bayes,)) %>% 
  mutate(month = gsub("[^0-9\\.]", "", month)) %>% 
  mutate(month = if_else(month =="",1,as.numeric(month))) %>% 
  arrange(month) %>%
  ggplot(data = .) +
  geom_point(aes(x = pred_mortality, 
                 y = actual_rate),
             size = 3) +
  geom_abline(intercept = 0, slope = 1, linetype = 3) +
  scale_x_continuous(limits = c(0,1),breaks=seq(0, 1, 0.5)) +
  scale_y_continuous(limits = c(0,1),breaks=seq(0, 1, 0.2)) +
  labs(x = "Average Predicted Mortality Risk",
       y = "Actual Mortality Rate",
       caption = paste0(20," bins") )+ 
  facet_wrap(~month) +
  theme(aspect.ratio = 1) + 
  ggsave("calib_monthly.pdf", w = 10, h=7)

# OO build ALL sample (general poulation exhibits using full cost data) OO  ------------------------------------------------
c(dt_for_exhibits_all,
  dt_cost_for_exhibits_all,
  dt_cost_for_exhibit_byMainCat_all )  %<-%  Build_relevant_dt("all")

# Average Monthly Spending of All 25+ Patients (a4) ---------------------------------------------------
do_Ave_Monthly_Spending_All()

# Average Monthly Spending of All 65+ Patients (a5)  --------------------------------------------------
do_Ave_Monthly_Spending_All(data_cost = dt_cost_for_exhibits_all[id_var %in%  
                                                   dt_for_exhibits_all[DMG_age>=65]$id_var],
            data =  dt_for_exhibits_all[DMG_age>=65],
            file_name = "ave_monthly_cost_table2_all65_func.tex")


# Admission Statistics, All Patients (table A10) ----------------------------------------------------------
do_table_3(dt_exh = dt_for_exhibits_all,
           dt_cost = dt_cost_for_exhibits_all,
           sample_n = "all",
           low_intensity = c("oncology","internal_medicine",
                             "geriatry","rehabilitation"))
